This is a demo for class SURV 622/SURVMETH 622. It contains two sections: a bag-of-words model and a Word2Vec model for representation learning.
# Load packages
# install.packages(c('tm', 'word2vec', 'udpipe', 'uwot', 'glmnet'))  # one-time install (the original line had a stray ';' inside the c() call)
library(tm)        # text mining: Corpus, tm_map, DocumentTermMatrix
## Loading required package: NLP
library(ggplot2)   # plotting (its annotate() masks NLP's, see message below)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(word2vec)  # CBOW / skip-gram word embeddings
library(uwot)      # UMAP dimensionality reduction
## Loading required package: Matrix
library(glmnet)    # penalized (lasso/ridge) regression
## Loaded glmnet 4.1-3
# Cosine similarity between two numeric vectors:
# dot(a, b) / (|a| * |b|). Returns NaN if either vector is all zeros.
cos.sim <- function(a, b) {
  dot_product <- sum(a * b)
  magnitude <- sqrt(sum(a^2) * sum(b^2))
  dot_product / magnitude
}
Data are downloaded from Kaggle (https://www.kaggle.com/datasets/kazanova/sentiment140). The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment.
# Read the annotated tweets (columns: target, ids, date, flag, user, text).
tweets = read.csv("training.100000.processed.noemoticon.csv",stringsAsFactors=FALSE, encoding = "UTF-8")
The data has been preprocessed. Let’s take a look.
# Peek at the first rows: sentiment label (0 = negative, 4 = positive),
# tweet id, timestamp, query flag, user handle, and raw tweet text.
head(tweets)
## target ids date flag user
## 1 4 2053333385 Sat Jun 06 04:10:15 PDT 2009 NO_QUERY meggz15
## 2 0 1964093465 Fri May 29 13:42:51 PDT 2009 NO_QUERY BattleBabeeyx
## 3 4 1980616407 Sun May 31 07:13:44 PDT 2009 NO_QUERY mrswilliams815
## 4 4 2174460574 Sun Jun 14 22:35:57 PDT 2009 NO_QUERY abbiereyes
## 5 0 1754448486 Sun May 10 06:18:29 PDT 2009 NO_QUERY cradow
## 6 4 2174736346 Sun Jun 14 23:11:51 PDT 2009 NO_QUERY aurattii
## text
## 1 is spendingg time with ness
## 2 @buckhollywood aw it was so sad shes too cute!
## 3 Back to Barstow today!!
## 4 @thedailysurvey vote ko po touch my hand. thanks po
## 5 needs Twilight! I feel so addicted.
## 6 @mungob yay nice to hear that u had a good day 4 a change cudn't happened 2 a more deservin MG
# Sanity checks: no missing values in the label, id, or text columns.
sum(is.na(tweets$target))
## [1] 0
sum(is.na(tweets$ids))
## [1] 0
sum(is.na(tweets$text))
## [1] 0
This line addresses an encoding error in the raw text.
# Re-encode the text from Windows-1252 to UTF-8. iconv() returns NA for
# strings it cannot convert -- presumably rare here; verify if NAs appear.
tweets$text = iconv(tweets$text,"WINDOWS-1252","UTF-8")
Preprocessing road map: 1. lowercase all characters 2. remove numbers 3. remove punctuation 4. remove stop words
# Build a corpus and normalize the text in five passes:
# lowercase -> strip numbers -> strip punctuation -> remove stop words ->
# collapse extra whitespace.
tweet_corpus = Corpus(VectorSource(tweets$text))
tweet_corpus = tm_map(tweet_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(tweet_corpus, content_transformer(tolower)):
## transformation drops documents
# NOTE(review): the repeated "transformation drops documents" warnings come
# from tm_map on a SimpleCorpus; presumably no documents are actually lost --
# confirm with length(tweet_corpus) if in doubt.
tweet_corpus = tm_map(tweet_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(tweet_corpus, removeNumbers): transformation
## drops documents
tweet_corpus = tm_map(tweet_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(tweet_corpus, removePunctuation): transformation
## drops documents
# "the" and "and" are already in stopwords("english"); listing them again is
# harmless but redundant.
tweet_corpus = tm_map(tweet_corpus, removeWords, c("the", "and", stopwords("english")))
## Warning in tm_map.SimpleCorpus(tweet_corpus, removeWords, c("the", "and", :
## transformation drops documents
tweet_corpus = tm_map(tweet_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(tweet_corpus, stripWhitespace): transformation
## drops documents
# Document-term matrix with raw term-frequency weighting:
# 100,000 documents x ~106k terms, almost entirely sparse.
tweet_dtm = DocumentTermMatrix(tweet_corpus)
tweet_dtm
## <<DocumentTermMatrix (documents: 100000, terms: 106647)>>
## Non-/sparse entries: 729556/10663970444
## Sparsity : 100%
## Maximal term length: 141
## Weighting : term frequency (tf)
Take a quick look at first 15 documents and terms.
# Inspect the top-left 15 x 15 corner of the document-term matrix; cells are
# raw term counts per document.
inspect(tweet_dtm[1:15, 1:15])
## <<DocumentTermMatrix (documents: 15, terms: 15)>>
## Non-/sparse entries: 19/206
## Sparsity : 92%
## Maximal term length: 14
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs back barstow buckhollywood cute ness sad shes spendingg time today
## 1 0 0 0 0 1 0 0 1 1 0
## 15 0 0 0 0 0 0 0 0 1 1
## 2 0 0 1 1 0 1 1 0 0 0
## 3 1 1 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 1 0 0 0 0 0 0 0 0 1
## 9 0 0 0 0 0 0 0 0 0 0
Drop the rarest terms to make the matrix less sparse.
# Keep only terms present in at least 1% of documents (max sparsity 0.99);
# this shrinks the vocabulary from ~106k terms down to 88.
tweet_dtm = removeSparseTerms(tweet_dtm, 0.99)
tweet_dtm
## <<DocumentTermMatrix (documents: 100000, terms: 88)>>
## Non-/sparse entries: 194642/8605358
## Sparsity : 98%
## Maximal term length: 8
## Weighting : term frequency (tf)
# Singular value decomposition of the document-term matrix; the left singular
# vectors (u) give an LSA-style low-rank document representation.
# NOTE(review): svd() needs the dtm coerced to a dense matrix -- presumably
# tm's as.matrix method handles this; confirm memory is sufficient.
tweet_svd = svd(tweet_dtm)
heatmap(tweet_svd$u[1:15,1:15])
We will build a Word2Vec model ourselves using the CBOW architecture on the tweets we have seen.
# Train a CBOW word2vec model on the raw tweet text:
# 15-dimensional embeddings, 10 training epochs. Seed fixed for reproducibility.
set.seed(1015)
model = word2vec(x = tweets$text, type = "cbow", dim = 15, iter = 10)
embedding = as.matrix(model)  # one row per vocabulary word
# Project the embeddings to 2-D with UMAP for visualization.
viz = umap(embedding, n_neighbors = 15, n_threads = 2)
# NOTE(review): the "//" gsub patterns target udpipe-style "lemma//upos"
# rownames; with plain-word rownames they leave the name unchanged and xpos
# just duplicates the word -- confirm this is intended.
df = data.frame(word = gsub("//.+", "", rownames(embedding)),
xpos = gsub(".+//", "", rownames(embedding)),
x = viz[, 1], y = viz[, 2],
stringsAsFactors = FALSE)
library(plotly)  # interactive plotting (masks ggplot2::last_plot, stats::filter, graphics::layout)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive scatter of words 500-700 in the 2-D UMAP space; words used in
# similar contexts should appear close together.
plot_ly(df[500:700,], x = ~x, y = ~y, type = "scatter", mode = 'text', text = ~word)
# Load a pre-trained word2vec model from disk with L2-normalized vectors
# (so inner products equal cosine similarities).
model = read.word2vec("18/model.bin", normalize = TRUE)
# Nearest neighbors in embedding space for two query words.
predict(model, newdata = c("fries", "money"), type = "nearest", top_n = 5)
## $fries
## term1 term2 similarity rank
## 1 fries french 0.9076371 1
## 2 fries burgers 0.8938329 2
## 3 fries hamburgers 0.8827033 3
## 4 fries sandwiches 0.8591300 4
## 5 fries fried 0.8587483 5
##
## $money
## term1 term2 similarity rank
## 1 money monies 0.8541762 1
## 2 money funds 0.8506691 2
## 3 money cash 0.8144652 3
## 4 money sums 0.8034646 4
## 5 money fund 0.7885280 5
# Classic analogy test: king - man + woman should land near "queen".
wv = predict(model, newdata = c("king", "man", "woman"), type = "embedding")
wv = wv["king", ] - wv["man", ] + wv["woman", ]
predict(model, newdata = wv, type = "nearest", top_n = 3)
## term similarity rank
## 1 king 0.9670773 1
## 2 queen 0.9044872 2
## 3 monarch 0.8917280 3
Let’s check whether the word “nurse” is more similar to “man” or to “woman”.
# Probe gender association in the embeddings: each predict() call returns a
# 1 x dim embedding matrix, and cos.sim() handles it elementwise via sum().
# "nurse" comes out closer to "woman" than to "man" -- an example of learned
# bias in the training corpus.
wv_woman = predict(model, newdata = c("woman"), type = "embedding")
wv_man = predict(model, newdata = c("man"), type = "embedding")
wv_nurse = predict(model, newdata = c("nurse"), type = "embedding")
cos.sim(wv_woman, wv_nurse)
## [1] 0.5568123
cos.sim(wv_man, wv_nurse)
## [1] 0.3725856
# Recode positive sentiment from 4 to 1 so target is binary {0, 1}.
tweets[tweets$target==4,'target'] = 1
# Dense 100,000 x 88 term-frequency feature matrix from the pruned dtm.
X = as.matrix(tweet_dtm)
# Sequential 80/20 train-test split.
# NOTE(review): assumes row order is already random; if the file is ordered
# by label or date the split is biased -- confirm against the source data.
X_train = X[1:80000,]
X_test = X[80001:100000,]
y_train = tweets$target[1:80000]
y_test = tweets$target[80001:100000]
Run cross-validation to find the best hyperparameter (lambda).
# Cross-validated lasso (alpha = 1) logistic regression over the lambda path.
cv_model = cv.glmnet(X_train, y_train, alpha = 1,family='binomial')
# Lambda that minimizes mean cross-validated deviance.
best_lambda <- cv_model$lambda.min
Refit using the best lambda value.
# Refit the lasso at the selected lambda.
# BUG FIX: the original call omitted family = 'binomial', so it silently fit
# a gaussian (linear) model even though cross-validation above selected
# lambda for a logistic model. The coefficient listing echoed below was
# produced by the original gaussian fit, so magnitudes will differ after
# rerunning. When predicting from this fit, request type = "response" to get
# probabilities so a 0.5 cutoff is meaningful.
best_model = glmnet(X_train, y_train, alpha = 1, family = 'binomial',
                    lambda = best_lambda)
coef(best_model)
## 89 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 0.509615405
## time 0.020775331
## sad -0.367571290
## back -0.041753708
## today -0.049851972
## thanks 0.301470798
## feel -0.172824045
## day 0.003479450
## good 0.159719444
## nice 0.208868687
## much -0.056079161
## never -0.129614258
## thank 0.314303077
## dont -0.120005905
## bad -0.259986504
## love 0.207308429
## now -0.029410892
## see 0.064532395
## can 0.066355604
## make 0.022973075
## think -0.011464786
## tomorrow -0.028943130
## amp 0.030020502
## fun 0.130560344
## get -0.031541247
## way 0.014475582
## want -0.146190158
## one -0.009686349
## haha 0.141985881
## really -0.072731958
## thats 0.033111237
## come -0.003404260
## didnt -0.170698351
## great 0.205186837
## last -0.128196002
## night 0.046721516
## happy 0.244948277
## just 0.012958240
## little .
## though -0.107214118
## week -0.030377983
## sorry -0.280943483
## watching 0.150672116
## lol 0.093223352
## hope 0.011095269
## know 0.015207428
## need -0.110259071
## working -0.119563625
## wait 0.216694965
## even -0.042661806
## people .
## new 0.112382486
## next -0.009084835
## weekend -0.006243249
## miss -0.320268474
## tonight 0.020843349
## will 0.062331529
## still -0.152095628
## cant -0.209224245
## yes 0.139856060
## work -0.156535604
## got -0.018286598
## well 0.032721052
## days .
## long -0.081132273
## gonna -0.022229080
## first 0.098387792
## ive -0.052330492
## youre 0.168850403
## wish -0.305522451
## bed -0.020719139
## sleep -0.089245488
## best 0.150142444
## right .
## better 0.062316862
## twitter 0.065944365
## hey 0.181833899
## home -0.077322460
## hate -0.309012170
## like 0.001146549
## going 0.001167402
## morning 0.054679125
## awesome 0.221887130
## take .
## ill 0.068789499
## getting -0.017765084
## yeah 0.055115221
## soon 0.016151073
## school -0.047642548
# Tidy the coefficients for plotting. Stored in `coef_df` rather than `coef`
# because the original assignment masked the stats::coef() generic, which
# would break any later coef(...) call in this session.
coef_df = as.data.frame(as.matrix(coef(best_model)))
coef_df = cbind(coef_df,term = row.names(coef_df))
coef_df = coef_df[order(coef_df$s0),]
# Dot plot of terms ordered by coefficient: negative terms ("sad", "hate",
# "miss") pull predictions toward negative sentiment; positive terms
# ("thanks", "happy", "awesome") toward positive sentiment.
ggplot(coef_df,
aes(x = reorder(term, s0), y = s0)) +
geom_point() +
coord_flip() +
theme_bw()
Using classification accuracy as the test metric.
# Predict on the held-out 20,000 tweets. type = "response" yields
# probabilities for a binomial fit (and fitted values for a gaussian fit,
# where it equals the default "link" output), so a 0.5 cutoff is correct in
# either case; the original code thresholded the raw link scale.
y_predicted <- predict(best_model, s = best_lambda, newx = X_test, type = "response")
y_predicted <- ifelse(y_predicted > 0.5, 1, 0)
# Classification accuracy on the test set; mean() avoids the hard-coded
# divisor 20000 and generalizes to any test-set size.
mean(y_test == y_predicted)
## [1] 0.64455